library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggthemes)
library(ggrepel)
library(showtext)
## Loading required package: sysfonts
## Loading required package: showtextdb
font_add_google("Lato", "lato")
showtext_auto()
babynames <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-03-22/babynames.csv')
## Rows: 1924665 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): sex, name
## dbl (3): year, n, prop
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
babynames_letter <- babynames %>% 
  mutate(name = tolower(name)) %>% 
  group_by(year, sex, name, n) %>% 
  summarise(letter = unlist(str_split(name, '')))
## `summarise()` has grouped output by 'year', 'sex', 'name', 'n'. You can override using the `.groups` argument.
babynames_letter %>% 
  group_by(sex, letter) %>% 
  summarise(
    n = sum(n)
  ) %>% 
  group_by(sex) %>% 
  mutate(
    prop = n/sum(n)
  ) %>% 
  ggplot() +
  geom_col(aes(letter, prop, fill = sex), position = 'dodge')
## `summarise()` has grouped output by 'sex'. You can override using the `.groups` argument.

plot_dat <- babynames_letter %>% 
  mutate(sex = ifelse(sex == 'F', 'Female', 'Male')) %>% 
  group_by(year, sex, letter) %>% 
  summarise(
    n = sum(n)
  ) %>% 
  group_by(year, sex) %>% 
  mutate(
    prop = n/sum(n)
  ) %>% 
  filter((letter %in% c('a', 'e', 'i', 'o', 'u')))
## `summarise()` has grouped output by 'year', 'sex'. You can override using the `.groups` argument.
lab_let <- plot_dat %>% 
  group_by(sex, letter) %>% 
  filter(year == max(year) | year == min(year)) %>% 
  mutate(year = ifelse(year == max(year), year+2, year-2))

plot_dat %>% 
  ggplot()  +
  geom_line(aes(year, prop, color = sex, group = interaction(sex, letter))) +
  geom_text(aes(year, prop, color = sex, group = letter, label = letter),
            data = lab_let) +
  scale_y_sqrt() +
  theme_few() 

set.seed(1990)

plot_dat <- babynames_letter %>% 
  mutate(sex = ifelse(sex == 'F', 'Female', 'Male')) %>% 
  group_by(year, sex, letter) %>% 
  summarise(
    n = sum(n)
  ) %>% 
  group_by(year, sex) %>% 
  mutate(
    prop = n/sum(n)
  ) %>% 
  # filter(!(letter %in% c('a', 'e', 'i', 'o', 'u'))) %>% 
  mutate(letter = letter %>% toupper()) %>% 
  ungroup() %>% 
  mutate(letter = factor(letter, sample(LETTERS)))
## `summarise()` has grouped output by 'year', 'sex'. You can override using the `.groups` argument.
lab_let <- plot_dat %>% 
  group_by(sex, letter) %>% 
  filter(year == max(year) | year == min(year)) %>% 
  mutate(year = ifelse(year == max(year), year, year))

plot_dat %>% 
  ggplot()  +
  geom_line(aes(year, prop, color = letter,
                group = interaction(sex, letter))) +
  geom_text_repel(aes(year, prop, color = letter, label = letter),
            data = filter(lab_let, year == max(year)), 
            direction = "y", hjust = "left", nudge_x = 30,
            max.overlaps = Inf, min.segment.length = 0,
            segment.color = 'black', seed = 0, size = 4) +
  geom_text_repel(aes(year, prop, color = letter, label = letter),
            data = filter(lab_let, year == min(year)), 
            direction = "y", hjust = "left", nudge_x = -30,
            max.overlaps = Inf, min.segment.length = 0,
            segment.color = 'black', seed = 1, size = 4) +
  scale_y_sqrt(
    breaks = c(0.001, 0.007, seq(0.02, 0.2, 0.02)),
    sec.axis = dup_axis()
  ) +
  scale_x_continuous(
    expand = expansion(mult = 0.3),
    breaks = seq(1880, 2020, 20)
  ) +
  theme_few() +
  facet_wrap(~sex) +
  theme(
    legend.position = 'none'  
  )+
  ylab('Proportion') +
  xlab('Year') +
  labs(
    title = 'The popularity of the letters used in newborn baby names has changed over the years',
    caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
  )+
    theme(
      text = element_text(family = 'lato'),
      axis.text = element_text(size = 12),
      plot.title = element_text(hjust = 0.5)
    ) 

plt_fun <- function(highlight) {
  
  plot_dat %>% 
    ggplot()  +
    geom_line(aes(year, prop, color = letter, 
                  size = letter %in% highlight,
                  alpha = letter %in% highlight,
                  group = interaction(sex, letter))) +
    geom_text_repel(aes(year, prop, color = letter, label = letter,
                        alpha = letter %in% highlight),
              data = filter(lab_let, year == max(year)), 
              direction = "y", hjust = "left", nudge_x = 30,
              max.overlaps = Inf, min.segment.length = 0,
              segment.color = 'black', seed = 0, size = 4
              ) +
    geom_text_repel(aes(year, prop, color = letter, label = letter,
                        alpha = letter %in% highlight),
              data = filter(lab_let, year == min(year)), 
              direction = "y", hjust = "left", nudge_x = -30,
              max.overlaps = Inf, min.segment.length = 0,
              segment.color = 'black', seed = 1, size = 4) +
    scale_y_sqrt(
      breaks = c(0, 0.001, 0.007, seq(0.02, 0.2, 0.02)),
      sec.axis = dup_axis()
    ) +
    scale_x_continuous(
      expand = expansion(mult = 0.3),
      breaks = seq(1880, 2020, 20)
    ) +
    theme_few() +
    facet_wrap(~sex) +
    theme(
      legend.position = 'none'  
    ) +
    scale_size_discrete(range = c(0.5, 1)) +
    scale_alpha_discrete(range = c(0.3, 1)) +
    ylab('Proportion') +
    xlab('Year') 
}
plt_fun(c('A', 'E', 'I', 'O', 'U')) +
  labs(
    title = 'The vowels',
    caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
  ) +
    theme(
      text = element_text(family = 'lato'),
      axis.text = element_text(size = 12),
      plot.title = element_text(hjust = 0.5)
    ) 
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.

babynames %>% 
  filter(year == '1960', sex == 'M', str_detect(tolower(name), 'f')) %>% 
  mutate(sum = sum(prop))
## # A tibble: 263 × 6
##     year sex   name          n     prop    sum
##    <dbl> <chr> <chr>     <dbl>    <dbl>  <dbl>
##  1  1960 M     Jeffrey   28831 0.0133   0.0418
##  2  1960 M     Frank     10759 0.00497  0.0418
##  3  1960 M     Jeff       8509 0.00393  0.0418
##  4  1960 M     Jeffery    7656 0.00354  0.0418
##  5  1960 M     Frederick  3478 0.00161  0.0418
##  6  1960 M     Fred       3398 0.00157  0.0418
##  7  1960 M     Clifford   2465 0.00114  0.0418
##  8  1960 M     Francis    2421 0.00112  0.0418
##  9  1960 M     Alfred     2405 0.00111  0.0418
## 10  1960 M     Franklin   1534 0.000708 0.0418
## # … with 253 more rows
babynames %>% 
  filter(year == '1960', sex == 'M') 
## # A tibble: 4,590 × 5
##     year sex   name        n   prop
##    <dbl> <chr> <chr>   <dbl>  <dbl>
##  1  1960 M     David   85928 0.0397
##  2  1960 M     Michael 84183 0.0389
##  3  1960 M     James   76842 0.0355
##  4  1960 M     John    76096 0.0351
##  5  1960 M     Robert  72369 0.0334
##  6  1960 M     Mark    58731 0.0271
##  7  1960 M     William 49354 0.0228
##  8  1960 M     Richard 43561 0.0201
##  9  1960 M     Thomas  39279 0.0181
## 10  1960 M     Steven  33895 0.0157
## # … with 4,580 more rows
plt_fun(c('F', 'S', 'O')) +
  labs(
    title = 'The rise and fall of letters',
    caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
  ) +
    theme(
      text = element_text(family = 'lato'),
      axis.text = element_text(size = 12),
      plot.title = element_text(hjust = 0.5)
    ) 
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.

plt_fun(c('R', 'W')) +
  labs(
    title = 'Decreasing trends',
    caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
  ) +
    theme(
      text = element_text(family = 'lato'),
      axis.text = element_text(size = 12),
      plot.title = element_text(hjust = 0.5)
    ) 
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.

plt_fun(c('K', 'X', 'N')) +
  labs(
    title = 'The newbies',
    caption = '#TidyTuesday (2022, w12) | Data: babynames R package | Graph by @irg_bio'
  ) +
    theme(
      text = element_text(family = 'lato'),
      axis.text = element_text(size = 12),
      plot.title = element_text(hjust = 0.5)
    ) 
## Warning: Using size for a discrete variable is not advised.
## Warning: Using alpha for a discrete variable is not advised.